Variable Selection_step()

변수 선택법(Variable Selection)
1. 전진선택법(Forward Selection)

> yard<-c(31, 31, 27, 39, 30, 32, 28, 23, 28, 35)

> area<-c(58, 51, 47, 35, 48, 42, 43, 56, 41, 41)

> park<-c(1, 1, 5, 5, 2, 4, 5, 1, 1, 3)

> dist<-c(492, 426, 400, 125, 443, 412, 201, 362, 192, 423)

> popul<-c(4412, 2061, 4407, 1933, 4029, 4180, 3444, 1683, 3020, 4459)

> price<-c(12631, 12084, 12220, 15649, 11486, 12276, 15527, 12666, 13180, 10169)

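(참고: 바로 아래 입력은 오타가 있는 첫 번째 시도이다. `lower-1`은 `lower=~1`, `aread`는 `area`로 써야 하며, 이대로 실행하면 오류가 발생한다. 수정된 명령은 그 다음 줄에 있다.)

> result<-step(lm(price ~ 1), scope=list(lower-1, upper=~yard+aread+park+dist+popul), direction='forward')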


> result<-step(lm(price ~ 1), scope=list(lower=~1, upper=~yard+area+park+dist+popul), direction='forward')

Start:  AIC=149.52

price ~ 1


        Df Sum of Sq      RSS    AIC

+ dist   1  16958139  8557243 140.60

+ popul  1   5431481 20083900 149.13

+ park   1   4895399 20619982 149.39

<none>               25515382 149.52

+ area   1   2806386 22708996 150.36

+ yard   1    282704 25232677 151.41


Step:  AIC=140.6

price ~ dist


        Df Sum of Sq     RSS    AIC

+ area   1   2214900 6342343 139.60

<none>               8557243 140.60

+ park   1    376540 8180703 142.15

+ popul  1     90527 8466716 142.49

+ yard   1     53104 8504139 142.53


Step:  AIC=139.6

price ~ dist + area


        Df Sum of Sq     RSS    AIC

+ park   1   2922548 3419795 135.43

<none>               6342343 139.60

+ yard   1    975693 5366650 139.93

+ popul  1    326295 6016048 141.07


Step:  AIC=135.43

price ~ dist + area + park


        Df Sum of Sq     RSS    AIC

+ yard   1   1338046 2081748 132.46

<none>               3419795 135.43

+ popul  1       879 3418916 137.42


Step:  AIC=132.46

price ~ dist + area + park + yard


        Df Sum of Sq     RSS    AIC

<none>               2081748 132.46

+ popul  1     54218 2027530 134.20




> summary(result)


Call:

lm(formula = price ~ dist + area + park + yard)


Residuals:

     1      2      3      4      5      6      7      8      9     10 

 211.9  193.4 -451.5 -193.6  247.8  801.9  387.0 -486.6  100.3 -810.6 


Coefficients:

            Estimate Std. Error t value Pr(>|t|)   

(Intercept) 3045.689   4084.218   0.746  0.48939   

dist         -16.446      2.489  -6.609  0.00119 **

area         230.563     61.193   3.768  0.01305 * 

park         436.801    155.508   2.809  0.03760 * 

yard         117.922     65.779   1.793  0.13300   

---

Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1


Residual standard error: 645.3 on 5 degrees of freedom

Multiple R-squared:  0.9184, Adjusted R-squared:  0.8531 

F-statistic: 14.07 on 4 and 5 DF,  p-value: 0.006267

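2. 후진소거법(Backward Elimination)

(참고: 아래에 실행된 명령은 상수항 모형 `lm(price~1)`에서 출발하여 `direction='both'`를 사용하므로, 실제로는 단계적 방법(3절)과 동일한 출력이다. 후진소거법은 모든 설명변수를 포함한 모형에서 시작하여 `step(lm(price ~ yard+area+park+dist+popul), direction='backward')`와 같이 실행한다.)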

> yard<-c(31, 31, 27, 39, 30, 32, 28, 23, 28, 35)

> area<-c(58, 51, 47, 35, 48, 42, 43, 56, 41, 41)

> park<-c(1, 1, 5, 5, 2, 4, 5, 1, 1, 3)

> dist<-c(492, 426, 400, 125, 443, 412, 201, 362, 192, 423)

> popul<-c(4412, 2061, 4407, 1933, 4029, 4180, 3444, 1683, 3020, 4459)

> price<-c(12631, 12084, 12220, 15649, 11486, 12276, 15527, 12666, 13180, 10169)

> result<-step(lm(price~1), scope=list(lower=~1, upper=~yard+area+park+dist+popul), direction='both')

Start:  AIC=149.52

price ~ 1


        Df Sum of Sq      RSS    AIC

+ dist   1  16958139  8557243 140.60

+ popul  1   5431481 20083900 149.13

+ park   1   4895399 20619982 149.39

<none>               25515382 149.52

+ area   1   2806386 22708996 150.36

+ yard   1    282704 25232677 151.41


Step:  AIC=140.6

price ~ dist


        Df Sum of Sq      RSS    AIC

+ area   1   2214900  6342343 139.60

<none>                8557243 140.60

+ park   1    376540  8180703 142.15

+ popul  1     90527  8466716 142.49

+ yard   1     53104  8504139 142.53

- dist   1  16958139 25515382 149.52


Step:  AIC=139.6

price ~ dist + area


        Df Sum of Sq      RSS    AIC

+ park   1   2922548  3419795 135.43

<none>                6342343 139.60

+ yard   1    975693  5366650 139.93

- area   1   2214900  8557243 140.60

+ popul  1    326295  6016048 141.07

- dist   1  16366653 22708996 150.36


Step:  AIC=135.43

price ~ dist + area + park


        Df Sum of Sq      RSS    AIC

+ yard   1   1338046  2081748 132.46

<none>                3419795 135.43

+ popul  1       879  3418916 137.42

- park   1   2922548  6342343 139.60

- area   1   4760908  8180703 142.15

- dist   1  17088473 20508268 151.34


Step:  AIC=132.46

price ~ dist + area + park + yard


        Df Sum of Sq      RSS    AIC

<none>                2081748 132.46

+ popul  1     54218  2027530 134.20

- yard   1   1338046  3419795 135.43

- park   1   3284902  5366650 139.93

- area   1   5910682  7992431 143.91

- dist   1  18183500 20265249 153.22




> summary(result)


Call:

lm(formula = price ~ dist + area + park + yard)


Residuals:

     1      2      3      4      5      6      7      8      9     10 

 211.9  193.4 -451.5 -193.6  247.8  801.9  387.0 -486.6  100.3 -810.6 


Coefficients:

            Estimate Std. Error t value Pr(>|t|)   

(Intercept) 3045.689   4084.218   0.746  0.48939   

dist         -16.446      2.489  -6.609  0.00119 **

area         230.563     61.193   3.768  0.01305 * 

park         436.801    155.508   2.809  0.03760 * 

yard         117.922     65.779   1.793  0.13300   

---

Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1


Residual standard error: 645.3 on 5 degrees of freedom

Multiple R-squared:  0.9184, Adjusted R-squared:  0.8531 

F-statistic: 14.07 on 4 and 5 DF,  p-value: 0.006267

3. 단계적 방법(Stepwise Method)

> yard<-c(31, 31, 27, 39, 30, 32, 28, 23, 28, 35)

> area<-c(58, 51, 47, 35, 48, 42, 43, 56, 41, 41)

> park<-c(1, 1, 5, 5, 2, 4, 5, 1, 1, 3)

> dist<-c(492, 426, 400, 125, 443, 412, 201, 362, 192, 423)

> popul<-c(4412, 2061, 4407, 1933, 4029, 4180, 3444, 1683, 3020, 4459)

> price<-c(12631, 12084, 12220, 15649, 11486, 12276, 15527, 12666, 13180, 10169)

> result<-step(lm(price~1), scope=list(lower=~1, upper=~yard+area+park+dist+popul), direction='both')

Start:  AIC=149.52

price ~ 1


        Df Sum of Sq      RSS    AIC

+ dist   1  16958139  8557243 140.60

+ popul  1   5431481 20083900 149.13

+ park   1   4895399 20619982 149.39

<none>               25515382 149.52

+ area   1   2806386 22708996 150.36

+ yard   1    282704 25232677 151.41


Step:  AIC=140.6

price ~ dist


        Df Sum of Sq      RSS    AIC

+ area   1   2214900  6342343 139.60

<none>                8557243 140.60

+ park   1    376540  8180703 142.15

+ popul  1     90527  8466716 142.49

+ yard   1     53104  8504139 142.53

- dist   1  16958139 25515382 149.52


Step:  AIC=139.6

price ~ dist + area


        Df Sum of Sq      RSS    AIC

+ park   1   2922548  3419795 135.43

<none>                6342343 139.60

+ yard   1    975693  5366650 139.93

- area   1   2214900  8557243 140.60

+ popul  1    326295  6016048 141.07

- dist   1  16366653 22708996 150.36


Step:  AIC=135.43

price ~ dist + area + park


        Df Sum of Sq      RSS    AIC

+ yard   1   1338046  2081748 132.46

<none>                3419795 135.43

+ popul  1       879  3418916 137.42

- park   1   2922548  6342343 139.60

- area   1   4760908  8180703 142.15

- dist   1  17088473 20508268 151.34


Step:  AIC=132.46

price ~ dist + area + park + yard


        Df Sum of Sq      RSS    AIC

<none>                2081748 132.46

+ popul  1     54218  2027530 134.20

- yard   1   1338046  3419795 135.43

- park   1   3284902  5366650 139.93

- area   1   5910682  7992431 143.91

- dist   1  18183500 20265249 153.22




> summary(result)


Call:

lm(formula = price ~ dist + area + park + yard)


Residuals:

     1      2      3      4      5      6      7      8      9     10 

 211.9  193.4 -451.5 -193.6  247.8  801.9  387.0 -486.6  100.3 -810.6 


Coefficients:

            Estimate Std. Error t value Pr(>|t|)   

(Intercept) 3045.689   4084.218   0.746  0.48939   

dist         -16.446      2.489  -6.609  0.00119 **

area         230.563     61.193   3.768  0.01305 * 

park         436.801    155.508   2.809  0.03760 * 

yard         117.922     65.779   1.793  0.13300   

---

Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1


Residual standard error: 645.3 on 5 degrees of freedom

Multiple R-squared:  0.9184, Adjusted R-squared:  0.8531 

F-statistic: 14.07 on 4 and 5 DF,  p-value: 0.006267